madgraph5 · valassi · Oct 20, 2025 · Nov 22, 2025 · Nov 30, 2025 · Dec 6, 2025
diff --git a/epochX/cudacpp/.gitignore b/epochX/cudacpp/.gitignore
@@ -6,3 +6,5 @@ run_[0-9]*
 events.lhe*
 
 py3_model.pkl
+
+perf.data*
diff --git a/.../cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc b/.../cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc
@@ -3,9 +3,16 @@
 // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
 // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
 
+#include "mgOnGpuConfig.h"
+
+// For tests: disable autovectorization in gcc (in the cppnone mode only)
+//#ifndef MGONGPU_CPPSIMD
+//#pragma GCC optimize("no-tree-vectorize")
+//#endif
+
 #include "color_sum.h"
 
-#include "mgOnGpuConfig.h"
+#include "mgOnGpuVectorsSplitMerge.h"
 
 #include "MemoryAccessMatrixElements.h"
 
@@ -88,60 +95,69 @@ namespace mg5amcCpu
     // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
     // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
     // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-    fptype_sv deltaMEs = { 0 };
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    fptype_sv deltaMEs_next = { 0 };
-    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv deltaMEs2 = { 0 };
+#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT )
+    // Mixed mode: must convert from double to float and possibly merge SIMD vectors
+    // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust)
     fptype2_sv jampR_sv[ncolor];
     fptype2_sv jampI_sv[ncolor];
     for( int icol = 0; icol < ncolor; icol++ )
     {
+#if defined MGONGPU_CPPSIMD
+      // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector
       jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
       jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+#else
+      // Mixed mode without SIMD: convert double to float
+      // Double/float mode without SIMD: fill the pre-created jampR_sv/jampI_sv vectors (faster and more robust)
+      jampR_sv[icol] = cxreal( allJamp_sv[icol] );
+      jampI_sv[icol] = cximag( allJamp_sv[icol] );
+#endif
     }
 #else
+    // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower)
     const cxtype_sv* jamp_sv = allJamp_sv;
 #endif
     // Loop over icol
     for( int icol = 0; icol < ncolor; icol++ )
     {
       // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype2_sv& jampRi_sv = jampR_sv[icol];
-      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT )
+      const fptype2_sv& jampRi_sv = jampR_sv[icol];
+      const fptype2_sv& jampIi_sv = jampI_sv[icol];
 #else
-      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
-      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+      const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] );
+      const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] );
 #endif
       fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
       fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
       // Loop over jcol
       for( int jcol = icol + 1; jcol < ncolor; jcol++ )
       {
         // Off-diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype2_sv& jampRj_sv = jampR_sv[jcol];
-        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT )
+        const fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        const fptype2_sv& jampIj_sv = jampI_sv[jcol];
 #else
-        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
-        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+        const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] );
+        const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] );
 #endif
         ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
         ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
       }
-      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      deltaMEs += fpvsplit0( deltaMEs2 );
-      deltaMEs_next += fpvsplit1( deltaMEs2 );
-#else
-      deltaMEs += deltaMEs2;
-#endif
+      deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
     }
     // *** STORE THE RESULTS ***
     using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
     fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
     fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs = fpvsplit0( deltaMEs2 );
+    fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 );
+#else
+    fptype_sv deltaMEs = deltaMEs2;
+#endif
     MEs_sv += deltaMEs; // fix #435
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
     fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );

diff --git a/...acpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/...acpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MGONGPUVECTORS_H
 #define MGONGPUVECTORS_H 1
@@ -744,92 +744,6 @@ namespace mg5amcCpu
 
 #endif // #ifdef MGONGPU_CPPSIMD
 
-  //--------------------------------------------------------------------------
-
-  // Functions and operators for fptype2_v
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-
-  inline fptype2_v
-  fpvmerge( const fptype_v& v1, const fptype_v& v2 )
-  {
-    // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537).
-    // I considered various alternatives, including
-    // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...)
-    // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast?
-    // Probably the best solution is intrinsics?
-    // - see https://stackoverflow.com/questions/5139363
-    // - see https://stackoverflow.com/questions/54518744
-    /*
-    fptype2_v out;
-    for( int ieppV = 0; ieppV < neppV; ieppV++ )
-    {
-      out[ieppV] = v1[ieppV];
-      out[ieppV+neppV] = v2[ieppV];
-    }
-    return out;
-    */
-#if MGONGPU_CPPSIMD == 2
-    fptype2_v out =
-      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] };
-#elif MGONGPU_CPPSIMD == 4
-    fptype2_v out =
-      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] };
-#elif MGONGPU_CPPSIMD == 8
-    fptype2_v out =
-      { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] };
-#endif
-    return out;
-  }
-
-  inline fptype_v
-  fpvsplit0( const fptype2_v& v )
-  {
-    /*
-    fptype_v out = {}; // see #594
-    for( int ieppV = 0; ieppV < neppV; ieppV++ )
-    {
-      out[ieppV] = v[ieppV];
-    }
-    */
-#if MGONGPU_CPPSIMD == 2
-    fptype_v out =
-      { (fptype)v[0], (fptype)v[1] };
-#elif MGONGPU_CPPSIMD == 4
-    fptype_v out =
-      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] };
-#elif MGONGPU_CPPSIMD == 8
-    fptype_v out =
-      { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
-#endif
-    return out;
-  }
-
-  inline fptype_v
-  fpvsplit1( const fptype2_v& v )
-  {
-    /*
-    fptype_v out = {}; // see #594
-    for( int ieppV = 0; ieppV < neppV; ieppV++ )
-    {
-      out[ieppV] = v[ieppV+neppV];
-    }
-    */
-#if MGONGPU_CPPSIMD == 2
-    fptype_v out =
-      { (fptype)v[2], (fptype)v[3] };
-#elif MGONGPU_CPPSIMD == 4
-    fptype_v out =
-      { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] };
-#elif MGONGPU_CPPSIMD == 8
-    fptype_v out =
-      { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] };
-#endif
-    return out;
-  }
-
-#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-
 #endif // #ifndef MGONGPUCPP_GPUIMPL
 
   //==========================================================================
Original file line number	Diff line number	Diff line change
Expand Up		@@ -6,3 +6,5 @@ run_[0-9]*
		events.lhe*

		py3_model.pkl

		perf.data*